Demo - 18 - Insurance Bill¶

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as pl
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
from plotly.offline import init_notebook_mode

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# import warnings
# warnings.filterwarnings('ignore')
In [59]:
data = pd.read_csv('insurance.csv')
data.head(5)
Out[59]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
In [55]:
data.describe(include='all')
Out[55]:
age sex bmi children smoker region charges
count 1338.000000 1338 1338.000000 1338.000000 1338 1338 1338.000000
unique NaN 2 NaN NaN 2 4 NaN
top NaN male NaN NaN no southeast NaN
freq NaN 676 NaN NaN 1064 364 NaN
mean 39.207025 NaN 30.663397 1.094918 NaN NaN 13270.422265
std 14.049960 NaN 6.098187 1.205493 NaN NaN 12110.011237
min 18.000000 NaN 15.960000 0.000000 NaN NaN 1121.873900
25% 27.000000 NaN 26.296250 0.000000 NaN NaN 4740.287150
50% 39.000000 NaN 30.400000 1.000000 NaN NaN 9382.033000
75% 51.000000 NaN 34.693750 2.000000 NaN NaN 16639.912515
max 64.000000 NaN 53.130000 5.000000 NaN NaN 63770.428010
In [56]:
data.isnull().sum()
Out[56]:
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
In [57]:
data.isna().sum()
Out[57]:
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

Label Encoding¶

In [60]:
#sex
le = LabelEncoder()
le.fit(data.sex.drop_duplicates()) 
data.sex = le.transform(data.sex)
le_sex_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Gender/Sex label mapping", le_sex_mapping)

# smoker or not
le.fit(data.smoker.drop_duplicates()) 
data.smoker = le.transform(data.smoker)
le_smoker_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Smoker label mapping", le_smoker_mapping)

#region
le.fit(data.region.drop_duplicates()) 
data.region = le.transform(data.region)
le_region_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Region label mapping", le_region_mapping)
Gender/Sex label mapping {'female': 0, 'male': 1}
Smoker label mapping {'no': 0, 'yes': 1}
Region label mapping {'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3}
In [45]:
data.corrwith(data['charges']).sort_values(ascending=False)
Out[45]:
charges     1.000000
smoker      0.787251
age         0.299008
bmi         0.198341
children    0.067998
sex         0.057292
region     -0.006208
dtype: float64
In [46]:
# Alternatively
data.corr()['charges'].sort_values(ascending=False)
Out[46]:
charges     1.000000
smoker      0.787251
age         0.299008
bmi         0.198341
children    0.067998
sex         0.057292
region     -0.006208
Name: charges, dtype: float64
In [47]:
data
Out[47]:
age sex bmi children smoker region charges
0 19 0 27.900 0 1 3 16884.92400
1 18 1 33.770 1 0 2 1725.55230
2 28 1 33.000 3 0 2 4449.46200
3 33 1 22.705 0 0 1 21984.47061
4 32 1 28.880 0 0 1 3866.85520
... ... ... ... ... ... ... ...
1333 50 1 30.970 3 0 1 10600.54830
1334 18 0 31.920 0 0 0 2205.98080
1335 18 0 36.850 0 0 2 1629.83350
1336 21 0 25.800 0 0 3 2007.94500
1337 61 0 29.070 0 1 1 29141.36030

1338 rows × 7 columns

Heatmap¶

In [9]:
f, ax = pl.subplots(figsize=(10, 8))
corr = data.corr()
sns.heatmap(corr, square=True, ax=ax, annot=True, cmap='mako')
Out[9]:
<Axes: >
No description has been provided for this image

Distribution of charges¶

In [10]:
sns.displot(data, x="charges", height=8)
Out[10]:
<seaborn.axisgrid.FacetGrid at 0x16c5b7370>
No description has been provided for this image
In [11]:
sns.displot(data, x="charges", height=8, log_scale=True)
Out[11]:
<seaborn.axisgrid.FacetGrid at 0x16c48eb50>
No description has been provided for this image

Charge on smoker and non-smoker¶

In [12]:
f= pl.figure(figsize=(16,6))

ax=f.add_subplot(121)
sns.histplot(data[(data.smoker == 1)]["charges"],color='c',ax=ax)
ax.set_title('Distribution of charges for smokers')

ax=f.add_subplot(122)
sns.histplot(data[(data.smoker == 0)]['charges'],color='b',ax=ax)
ax.set_title('Distribution of charges for non-smokers')
Out[12]:
Text(0.5, 1.0, 'Distribution of charges for non-smokers')
No description has been provided for this image
In [13]:
# Log-scale
f= pl.figure(figsize=(16,6))

ax=f.add_subplot(121)
sns.histplot(data[(data.smoker == 1)]["charges"],
             color='c', ax=ax, log_scale=True,)
ax.set_title('Distribution of charges for smokers')

ax=f.add_subplot(122)
sns.histplot(data[(data.smoker == 0)]['charges'],
             color='b', ax=ax, log_scale=True,)
ax.set_title('Distribution of charges for non-smokers')
Out[13]:
Text(0.5, 1.0, 'Distribution of charges for non-smokers')
No description has been provided for this image

Patient gender count - Smoker and non-smoker¶

In [14]:
# 0:female 1:male
# 0:non-smoker 1:smoker
In [15]:
import matplotlib.pyplot as plt
f = sns.catplot(x="smoker", kind="count",hue = 'sex', 
            palette="pink", data=data, legend=False)
plt.legend(title='Smoker', loc='upper right',
           labels=['Female', 'Male'])
plt.show(f)
No description has been provided for this image
In [16]:
le.inverse_transform(data['smoker'])
Out[16]:
array(['northwest', 'northeast', 'northeast', ..., 'northeast',
       'northeast', 'northwest'], dtype=object)

Distribution on charge – smoker vs non smoker¶

In [17]:
# 0:female 1:male
# 0:non-smoker 1:smoker
sns.boxplot(x="sex", y="charges", hue="smoker",
            palette=["g", "m"], data=data,)
Out[17]:
<Axes: xlabel='sex', ylabel='charges'>
No description has been provided for this image

Box plot for charges of men¶

In [62]:
# 0: non-smoker;   1: smoker
pl.figure(figsize=(12,5))
pl.title("Box plot for charges of men")
sns.boxplot(y="smoker", x="charges", data =  data[(data.sex == 1)] , 
            orient="h", palette = 'magma', hue="smoker")
Out[62]:
<Axes: title={'center': 'Box plot for charges of men'}, xlabel='charges', ylabel='smoker'>
No description has been provided for this image

Box plot for charges of women¶

In [61]:
pl.figure(figsize=(12,5))
pl.title("Box plot for charges of women")
sns.boxplot(y="smoker", x="charges", data =  data[(data.sex == 0)] , orient="h", 
            palette = 'rainbow', hue='smoker' )
Out[61]:
<Axes: title={'center': 'Box plot for charges of women'}, xlabel='charges', ylabel='smoker'>
No description has been provided for this image

Distribution of age¶

In [20]:
pl.figure(figsize=(12,5))
pl.title("Distribution of age")
ax = sns.histplot(data["age"], color = 'g')
No description has been provided for this image

The number of smokers and non-smokers (18 years old)¶

In [21]:
sns.catplot(x="smoker", kind="count",hue = 'sex', palette="rainbow", 
            data=data[(data.age == 18)])
pl.title("The number of smokers and non-smokers (18 years old)")
Out[21]:
Text(0.5, 1.0, 'The number of smokers and non-smokers (18 years old)')
No description has been provided for this image

18 years old - a very young age. Does smoking affect the cost of treatment at this age?¶

In [22]:
pl.figure(figsize=(12,5))
pl.title("Box plot for charges 18 years old smokers")
sns.boxplot(y="smoker", x="charges", data = data[(data.age == 18)] , orient="h", hue='smoker', palette = 'pink')
Out[22]:
<Axes: title={'center': 'Box plot for charges 18 years old smokers'}, xlabel='charges', ylabel='smoker'>
No description has been provided for this image

Distribution of charges and age for non-smokers¶

In [23]:
g = sns.jointplot(
    data=data[(data.smoker == 0)],  x="age", y="charges",
    kind="kde", color="g")
ax.set_title('Distribution of charges and age for non-smokers')
Out[23]:
Text(0.5, 1.0, 'Distribution of charges and age for non-smokers')
No description has been provided for this image

Distribution of charges and age for smokers¶

In [24]:
g = sns.jointplot(
    data=data[(data.smoker == 1)],  x="age", y="charges",
    kind="kde", color="grey")
ax.set_title('Distribution of charges and age for smokers')
Out[24]:
Text(0.5, 1.0, 'Distribution of charges and age for smokers')
No description has been provided for this image

Non smoker charge distribution on age¶

In [25]:
fig = px.scatter(data[(data.smoker == 0)], x="age", y="charges",
             log_x=True, size_max=60, title='Non smoker charge distribution on age')
fig.show()

Smoker charge distribution on age¶

In [26]:
fig = px.scatter(data[(data.smoker == 1)], x="age", y="charges", 
                  log_x=True, size_max=60, 
                 title='Smoker charge distribution on age')
fig.show()

Smokers and non-smokers on age¶

In [27]:
sns.lmplot(x="age", y="charges", hue="smoker", data=data, height=6 )
ax.set_title('Smokers and non-smokers')
Out[27]:
Text(0.5, 1.0, 'Smokers and non-smokers')
No description has been provided for this image

Distribution of bmi¶

In [28]:
pl.figure(figsize=(12,5))
pl.title("Distribution of bmi")
ax = sns.histplot(data["bmi"], color = 'm')
No description has been provided for this image

Distribution of charges for patients with BMI greater than 30¶

In [29]:
pl.figure(figsize=(12,5))
pl.title("Distribution of charges for patients with BMI greater than 30")
ax = sns.histplot(data[(data.bmi >= 30)]['charges'], color = 'r')
No description has been provided for this image

Distribution of charges for patients with BMI less than 30¶

In [30]:
pl.figure(figsize=(12,5))
pl.title("Distribution of charges for patients with BMI less than 30")
ax = sns.histplot(data[(data.bmi < 30)]['charges'], color = 'b')
No description has been provided for this image

Scatter plot of charges and bmi¶

In [31]:
pl.figure(figsize=(10,6))
ax = sns.scatterplot(x='bmi',y='charges',data=data,palette='magma',hue='smoker')
ax.set_title('Scatter plot of charges and bmi')

sns.lmplot(x="bmi", y="charges", hue="smoker", data=data, palette = 'magma')
Out[31]:
<seaborn.axisgrid.FacetGrid at 0x303b61730>
No description has been provided for this image
No description has been provided for this image

Child patient count with how many sibling¶

In [32]:
sns.catplot(x="children", kind="count", palette="ch:.25", data=data, hue='children')
Out[32]:
<seaborn.axisgrid.FacetGrid at 0x303a9c910>
No description has been provided for this image

Smokers and non-smokers who have childrens¶

In [33]:
sns.catplot(x="smoker", kind="count", palette="rainbow",hue = "sex",
            data=data[(data.children > 0)])
ax.set_title('Smokers and non-smokers who have childrens')
Out[33]:
Text(0.5, 1.0, 'Smokers and non-smokers who have childrens')
No description has been provided for this image
In [ ]: